Highlighting collaborations in SNSF grants running in 2017.

Data

# URLs of the public SNSF P3 exports we need (grants + people);
# the collaboration export is left commented out because it is unused here
p3url <- c(
  "http://p3.snf.ch/P3Export/P3_GrantExport.csv",
  "http://p3.snf.ch/P3Export/P3_PersonExport.csv")
# "http://p3.snf.ch/P3Export/P3_CollaborationExport.csv")

# download each URL to its destination file name
# NOTE(review): `saveas` is not defined in this chunk -- presumably a
# character vector of local file names defined earlier; confirm upstream
purrr::walk2(p3url, saveas, download.file, quiet = TRUE)

You can download up-to-date SNSF grant data here. Before reading the datasets in R, however, I recommend using MS Excel (or Libre/Open Office) to fix csv parsing problems.

grants.csv and people.csv correspond to P3_GrantExport.csv and P3_PersonExport.csv, respectively. I made sure they read faultlessly and cleaned their headers with the clean_names() function from the janitor package.

Grants connecting Switzerland

We want to highlight 2017 grants connecting institutes from different parts of Switzerland and of the world.

p_load("lubridate", "stringr")
grants <- fread("grants.csv")
people <- fread("people.csv")

# focus on a time range
# NOTE(review): `YEAR` is assumed to be defined earlier in the document
trange <- ymd(c(str_c(YEAR, "-01-01"), str_c(YEAR, "-12-31")))

# find grants running in that time range: a grant overlaps the range
# when it starts before the range's end and ends after the range's start
grants <- grants %>%
  select(
    project_number, discipline_number, start_date, end_date) %>%
  mutate(
    start_date = ymd(str_sub(start_date, 1, 10)),
    end_date = ymd(str_sub(end_date, 1, 10)),
    # the research domain is the leading digit of the discipline number;
    # use str_extract() (not str_extract_all()): it returns a character
    # vector with NA for non-matches, so as.integer() coercion is safe,
    # whereas as.integer() on the list from str_extract_all() breaks on
    # any row without a match
    domain = as.integer(str_extract(discipline_number, "^[0-9]"))) %>%
  filter(start_date <= trange[2], end_date >= trange[1])

head(grants)
  project_number discipline_number start_date   end_date domain
1         128565             10105 2011-03-01 2018-02-28      1
2         129572             10105 2011-01-01 2017-02-28      1
3         133815             20404 2012-04-01 2017-03-31      2
4         135192             30720 2012-07-01 2018-04-30      3
5         135970             10104 2014-01-01 2017-12-31      1
6         136707             20507 2012-02-01 2017-05-31      2
# find the people involved in those grants
people <- people %>%
  select(person_id_snsf, institute_place, starts_with("projects")) %>%
  filter(institute_place != "") %>% 
  select(-projects_as_responsible_applicant) %>%
  # collapse all remaining projects_* columns into one ";"-separated string
  unite(project_number, starts_with("projects"), sep = ";") %>%
  mutate(
    # strip French "Cedex" suffixes and trailing postal-district numbers
    institute_place = str_replace_all(institute_place, " Cedex(.*)?| [0-9]{1,2}", ""),
    # squeeze repeated separators, then drop leading/trailing ";" and the
    # literal "NA" placeholders left by empty project columns
    project_number = str_replace_all(project_number, "[;]+", ";"),
    project_number = str_replace_all(project_number, "^[;]|[;]$|NA", ""),
    project_number = str_split(project_number, ";")) %>% 
  # name the list-column explicitly: bare unnest() is deprecated in tidyr >= 1.0
  unnest(project_number) %>%
  mutate(project_number = as.integer(project_number)) %>%
  # keep only people attached to a grant running in the time range
  semi_join(grants, by = "project_number")

head(people)
  person_id_snsf institute_place project_number
1         587432       Stavanger         171200
2         702008        Fribourg         175960
3         687469           Basel         170809
4         642410        Lausanne         153952
5         642410        Lausanne         153990
6          54956          Tirana         152346

The collaboration network

To show the network in a map, we focus on grants connecting people in different places.

# how many distinct places per grant?
places_per_grant <- people %>%
  group_by(project_number) %>%
  summarise(n_places = n_distinct(institute_place))

# network core: keep the (place, grant) pairs belonging to grants
# that span more than one place, one row per pair
core <- people %>%
  select(institute_place, project_number) %>%
  semi_join(filter(places_per_grant, n_places > 1), by = "project_number") %>%
  arrange(project_number) %>%
  distinct()

head(core)
  institute_place project_number
1        Konstanz         128565
2            Bern         128565
3          Berlin         129572
4           Basel         129572
5      St. Gallen         135970
6         Rostock         135970

We also need the geocodes of all the places of the network. You find more information on how to get geocode data in R here and here. In essence, I use two APIs: OpenStreetMap and Google Maps as fallback. To save the effort of writing HTTP requests by hand, I use dedicated R packages to query map data. To speed up the analysis, we will store the geocodes in geocodes.csv.

# initialize geocodes as a tibble: one row per unique place, with
# coordinates, address and OSM/Google id to be filled in below
# (tibble() recycles the scalar NAs to the length of `place`)
geocodes <- tibble(
  place = sort(unique(core$institute_place)),
  lat = NA_real_,
  lon = NA_real_,
  addr = NA_character_,
  id = NA_character_)

p_load_gh("hrbrmstr/nominatim", "ggmap") # ggmap requires `libpng16-dev` in ubuntu
# MapQuest/Nominatim API key, kept out of the source in a local file
osm_key <- readLines("mapquest.key")

# Query the OpenStreetMap (MapQuest Nominatim) API for one place name.
# Returns c(place_id, display_name, lat, lon) on a hit, scalar NA otherwise.
osm <- function(query, osm_key) {
  res <- osm_search_spatial(query, limit = 1, key = osm_key)
  hit <- res[[1]]
  if (is.null(hit)) return(NA)
  c(hit$place_id, hit$display_name, hit$lat, hit$lon)
}

# Google Maps geocoding API (2500 reqs/day, 50 reqs/sec max).
# Returns c(place_id, formatted_address, lat, lng) on a hit, scalar NA otherwise.
google <- function(query) {
  resp <- geocode(query, output = "all")
  if (resp$status != "OK") return(NA)
  best <- resp$results[[1]]
  c(best$place_id, best$formatted_address,
    best$geometry$location$lat, best$geometry$location$lng)
}

# Geocode every place: try OpenStreetMap first, fall back to Google.
# A successful lookup is a length-4 character vector; a miss is scalar NA,
# so test only the first element to keep the `if` condition scalar
# (a vector condition is an error in R >= 4.2 -- the original's
# "works despite of some warnings" hinted at exactly this).
for (k in seq_along(geocodes$place)) {
  cat("   ........ ", k, ": ", geocodes$place[k], "\n")
  info <- osm(geocodes$place[k], osm_key)
  cat("   osm info: ", info, "\n")
  # fallback to Google when OSM returned its NA sentinel
  if (is.na(info[1])) {
    info <- google(geocodes$place[k])
    cat("    google: ", info, "\n")
  }
  # store the result, if any, in the geocodes table
  if (!is.na(info[1])) {
    geocodes$id[k] <- info[1]
    geocodes$addr[k] <- info[2]
    geocodes$lat[k] <- as.numeric(info[3])
    geocodes$lon[k] <- as.numeric(info[4])
  }
}
# cache the geocodes so reruns need no API calls; drop unresolved places
write_csv(geocodes, path = "geocodes.csv")
geocodes <- fread("geocodes.csv") %>% drop_na()
head(geocodes)
              place       lat       lon
1: 69120 Heidelberg 49.416449  8.688807
2:            Aarau 47.392715  8.044445
3:         Aarhus C 56.149628 10.213405
4:         Aberdeen 57.145245 -2.091375
5:          Abidjan  5.409118 -4.042210
6:    Abomey-Calavi  6.415369  2.306228
                                                                                   addr
1: Neuenheim, Heidelberg, Regierungsbezirk Karlsruhe, Baden-Württemberg, 69120, Germany
2:                                       Aarau, Bezirk Aarau, Aargau, 5000, Switzerland
3:                         Aarhus, Aarhus Municipality, Central Denmark Region, Denmark
4:                                    Aberdeen, Aberdeenshire, Scotland, United Kingdom
5:                                                               Abidjan, Côte d'Ivoire
6:                                                     Abomey-Calavi, Atlantique, Benin
          id
1: 151274321
2: 151329375
3:    133615
4:    113817
5: 151537515
6: 151484207
# add geocode data (lat/lon/addr/id) to each row of the network core
core <- core %>%
  left_join(geocodes, by = c("institute_place" = "place"))

Swiss collaboration network

Let's show the Swiss network first.
The visualization is heavily inspired by this and/or this.

# keep rows whose geocoded address ends in "Switzerland"
core_ch <- filter(core, str_detect(addr, "Switzerland$"))
# find grants connecting more than one Swiss place
ch_per_grant <- core_ch %>%
  group_by(project_number) %>%
  summarise(n_places = n_distinct(institute_place))
# swiss core: projects involving multiple places in Switzerland
core_ch <- core_ch %>%
  semi_join(filter(ch_per_grant, n_places > 1), by = "project_number")

# nodes (places): one row per place with coordinates and degree -----------
nodes_ch <- core_ch %>%
  group_by(institute_place) %>%
  summarise(y = first(lat), x = first(lon), size = n()) %>%
  arrange(desc(size))
head(nodes_ch)
write_csv(nodes_ch, path = "nodes_ch.csv")

p_load("magrittr")
# For one grant, build a tibble with one row per unordered pair of places
# involved in it: columns `number`, `from`, `to` (places sorted, so each
# pair appears once with from < to alphabetically).
project_edges <- function(number, df) {
  places <- df %>%
    filter(project_number == number) %>%
    pull(institute_place) %>%
    sort()
  pairs <- combn(places, m = 2)
  tibble(number = number, from = pairs[1, ], to = pairs[2, ])
}

# edges (grants) ----------------------------------------------------------
# expand every grant into its place pairs, then stack into one table
edges_ch <- unique(core_ch$project_number) %>%
  purrr::map(project_edges, df = core_ch) %>%
  bind_rows()
# attach coordinates to both endpoints, then collapse to one row per
# (from, to) pair, counting grants as edge strength
edges_ch <- edges_ch %>%
  left_join(select(nodes_ch, -size), by = c("from" = "institute_place")) %>%
  rename(x1 = x, y1 = y) %>%
  left_join(select(nodes_ch, -size), by = c("to" = "institute_place")) %>%
  rename(x2 = x, y2 = y) %>%
  group_by(from, to) %>%
  summarise(
    x1 = first(x1),
    y1 = first(y1),
    x2 = first(x2),
    y2 = first(y2),
    strength = n())
head(edges_ch)
write_csv(edges_ch, path = "edges_ch.csv")
# plot the nodes and the edges on the map of Switzerland
p_load("maps", "mapdata", "geosphere")
# re-read from disk so the plotting section can run standalone
nodes_ch <- fread("nodes_ch.csv")
edges_ch <- fread("edges_ch.csv")

# edge colour ramp: near-transparent grey to translucent dark blue,
# indexed 1..100 by normalised edge strength below
col_dark <- adjustcolor("#252525", alpha = 0.05)
col_light <- adjustcolor("#08306b", alpha = 0.4)
edge_pal <- colorRampPalette(c(col_dark, col_light), alpha = TRUE)
edge_col <- edge_pal(100)

# Draw the Swiss collaboration network to core_ch.svg.
svg("core_ch.svg", width = 12, height = 7)
par(mar = c(0, 0, 0, 0))
maps::map(database = "worldHires", regions = "Switzerland",
          fill = FALSE, col = rgb(0, 0, 0, .2))

# map nodes: point size grows with the log of the place's degree
points(x = nodes_ch$x, y = nodes_ch$y, pch = 16,
  cex = log(nodes_ch$size)/3, col = rgb(0, 0, 0, .2))

# map edges as great-circle arcs; colour and width scale with the square
# root of edge strength, normalised against the strongest edge
max_strength <- max(sqrt(edges_ch$strength))  # loop invariant, hoisted
for (k in seq_len(nrow(edges_ch))) {
  arc <- gcIntermediate(
    c(edges_ch$x1[k], edges_ch$y1[k]),
    c(edges_ch$x2[k], edges_ch$y2[k]),
    n = 100, addStartEnd = TRUE)
  # clamp to >= 1 so the palette index can never be 0 (edge_col[0] is empty)
  strength <- max(1, round(100 * sqrt(edges_ch$strength[k]) / max_strength))
  lines(arc, col = edge_col[strength], lwd = strength / 10)
}

# label main cities
cities <- c("Zürich", "Lausanne", "Bern", "Genève",
            "Basel", "Fribourg", "Neuchâtel", "St. Gallen",
            "Lugano", "Luzern", "Winterthur")

city_nodes <- filter(nodes_ch, institute_place %in% cities)
graphics::text(
  city_nodes$x, city_nodes$y,
  labels = city_nodes$institute_place, pos = 3, cex = .7, col = rgb(0, 0, 0, .66))
invisible(dev.off())
switzerland

switzerland

International collaboration network

Same as above, but for the rest of the world.

# nodes (places): one row per place worldwide, with coordinates and degree
nodes_io <- core %>%
  group_by(institute_place) %>%
  summarise(y = first(lat), x = first(lon), size = n()) %>%
  arrange(desc(size))
write_csv(nodes_io, path = "nodes_io.csv")

# edges (grants): expand each grant into its place pairs, then stack
edges_io <- unique(core$project_number) %>%
  purrr::map(project_edges, df = core) %>%
  bind_rows()

# add geocode metadata and group into one row per (from, to) pair,
# counting grants as edge strength
edges_io <- edges_io %>%
  left_join(select(nodes_io, -size), by = c("from" = "institute_place")) %>%
  rename(x1 = x, y1 = y) %>%
  left_join(select(nodes_io, -size), by = c("to" = "institute_place")) %>%
  rename(x2 = x, y2 = y) %>%
  group_by(from, to) %>%
  summarise(
    x1 = first(x1),
    y1 = first(y1),
    x2 = first(x2),
    y2 = first(y2),
    strength = n())
write_csv(edges_io, path = "edges_io.csv")
# plot the nodes and the edges on the world map
# re-read from disk so the plotting section can run standalone
nodes_io <- fread("nodes_io.csv")
edges_io <- fread("edges_io.csv")

# drop places and edges already drawn on the Swiss map, plus any edges
# with missing endpoint coordinates
nodes_io <- anti_join(nodes_io, nodes_ch, by = "institute_place")
edges_io <- edges_io %>%
  anti_join(edges_ch, by = c("from" = "from", "to" = "to")) %>%
  drop_na()

# fainter colour ramp than the Swiss map (far more edges worldwide)
col_dark <- adjustcolor("#252525", alpha = 0.01)
col_light <- adjustcolor("#08306b", alpha = 0.2)
edge_pal <- colorRampPalette(c(col_dark, col_light), alpha = TRUE)
edge_col <- edge_pal(100)


# Draw the international collaboration network to core.svg.
svg("core.svg", width = 12, height = 7)
par(mar = c(0, 0, 0, 0))
maps::map("world", fill = FALSE, col = rgb(0, 0, 0, .2))
# map nodes: point size grows with the log of the place's degree
points(x = nodes_io$x, y = nodes_io$y, pch = 16,
  cex = log(nodes_io$size)/3, col = rgb(0, 0, 0, .2))

# map edges as great-circle arcs; colour and width scale with the square
# root of edge strength, normalised against the strongest edge
max_strength <- max(sqrt(edges_io$strength))  # loop invariant, hoisted
for (k in seq_len(nrow(edges_io))) {
  arc <- gcIntermediate(
    c(edges_io$x1[k], edges_io$y1[k]),
    c(edges_io$x2[k], edges_io$y2[k]),
    n = 100, addStartEnd = TRUE)
  # clamp to >= 1 so the palette index can never be 0 (edge_col[0] is empty)
  strength <- max(1, round(100 * sqrt(edges_io$strength[k]) / max_strength))
  lines(arc, col = edge_col[strength], lwd = strength / 50)
}

invisible(dev.off())
world

world

About

This page uses the html_clean template from the prettydoc package.